Corpus description and selection
Load data
# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/tokenized/boudams/", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_expanded_words.csv")
data = read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1)
data = t(data)
Text lengths
nwords = colSums(data)
summary(nwords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 298 2244 3539 5070 6774 18971
boxplot(nwords)
boxplot(nwords)$out

## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 29_Wau_Leg-C_Co_Ev_Vie_Martin
## 17920 14432
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial
## 18971 15255
head(sort(nwords), n = 15)
## 03_Ano_Leg-A_Ap_NA_Mar_Jean 62_Ano_Leg-N_NA_NA_NA_Index
## 298 301
## 61_Ano_Leg-B_NA_NA_NA_Jugement 30_Wau_Leg-C_Co_Ev_Tra_Martin2
## 406 722
## 08_Ano_Leg-A_Ap_NA_Vie_Philippe 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 1014 1293
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 32_Wau_Leg-C_Co_Ev_Vie_Brice
## 1356 1385
## 60_Ano_Leg-B_NA_NA_NA_Antechriste 54_Ano_Leg-C_Vi_NA_Vie_Pelagie
## 1485 1506
## 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 11_Ano_Leg-A_Ap_NA_Vie_Marc
## 1676 1820
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 1894 1935
## 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 1960
toKeep = colnames(data)[nwords > 1000]
toKeep = toKeep[grep("Bestiaire", toKeep, invert = TRUE)]
df = as.data.frame(nwords)
ggplot(df, aes(x="", y=nwords)) + geom_violin() + geom_boxplot(width=0.3) + theme(axis.text.y = element_text(size = rel(1.4)), axis.title = element_text(size = rel(1.4))) + xlab("Est. length in words of corpus texts") + scale_y_continuous(breaks=c(0, 2500, 5000, 7500, 10000, 12500, 15000, 17500))

Transkribus with linguistic annotation
POS 3-grams
data = read.csv("data/transkr_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data = data[, -1]
colnames(data) = gsub("^X", "", colnames(data))
colnames(data) = gsub(".decolumnized", "", colnames(data))
colnames(data) = gsub("Leg.", "Leg-", colnames(data))
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)
Burrows + vector-length norm
d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
# save data for robustness checks
d = d[select,]
POS3grSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHPOS3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams = cahPlotCol(myCAH, k = 9, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somCAHPOS3gr = somCAH
somplotPOS3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")
Lemmas
data = read.csv("data/transkr_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data = data[, -1]
colnames(data) = gsub("^X", "", colnames(data))
colnames(data) = gsub(".decolumnized", "", colnames(data))
colnames(data) = gsub("Leg.", "Leg-", colnames(data))
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)
Burrows + vector-length norm
d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
d = d[select,]
LemmasSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHLemmas = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas = cahPlotCol(myCAH, k = 9, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somCAHLemmas = somCAH
somplotLemmas = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")
Function words from lemmas
# Find function words
#rownames(data)[1:250]
functionLemmas = source("functionLemmas.R")$value
Burrows + vector-length norm
d = relativeFreqs(data)
d = d[functionLemmas,]
FLSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHFL = myCAH
# barplot(sort(myCAH$height))
plotFL = cahPlotCol(myCAH, k = 8, main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
somCAH = somCluster(d)
somCAHFL = somCAH
somplotFL = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words (lemmas)")
Affixes + POS 3-gr + Function words (lemmas)
data = rbind(AffixesSave, POS3grSave, FLSave)
d = normalisations(data)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHGlob = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob = cahPlotCol(myCAH, k = 9, main = "Affixes + POS 3- grams + Function words (lemmas)")
somCAH = somCluster(d)
somCAHGlob = somCAH
somplotGlob = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)")
Plots
Reference results on the three feature sets
gridExtra::grid.arrange(plotRaw3grams, somplotRaw3grams, plotGlob, somplotGlob, plotWordsLemmas, somplotWordsLemmas, ncol = 2, left = "Without SOM", right = "With SOM")

Analyses
#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
gridExtra::grid.arrange(plotRaw3grams, plotForms, plotAffixes, plotFW, plotLemmas, plotFL, plotPOS3grams, plotGlob, ncol = 2)

gridExtra::grid.arrange(somplotRaw3grams, somplotForms, somplotAffixes, somplotFW, somplotLemmas, somplotFL, somplotPOS3grams, somplotGlob, ncol = 2)

Robustness
cahList = list(raw3grams = CAHRaw3gr, Affs = CAHAffs, FunctLemm = CAHFL, POS3gr = CAHPOS3gr, FLPOSandAffs = CAHGlob, Forms = CAHForms, Lemmas = CAHLemmas, WordsLemmas = CAHWordsLemmas, UnnormFW = CAHFW)
#compareHC(cahList, k = 9)
benchmark = benchmarkHC(CAHRaw3gr, cahList, k = 9)
round(benchmark, digits = 2)
## N CPWauchier AC CPREF
## raw3grams 1276 0.98 0.63 1.00
## Affs 774 0.98 0.65 0.86
## FunctLemm 100 1.00 0.69 0.73
## POS3gr 328 0.97 0.68 0.68
## FLPOSandAffs 1202 0.98 0.62 0.83
## Forms 698 0.98 0.63 0.81
## Lemmas 512 0.97 0.59 0.73
## WordsLemmas 1210 0.98 0.62 0.85
## UnnormFW 171 0.98 0.72 0.81
# Now with SOM
cahSOMList = list(raw3grams = somCAHRaw3gr, Affs = somCAHAffs, FunctLemm = somCAHFL, POS3gr = somCAHPOS3gr, FLPOSandAffs = somCAHGlob, Forms = somCAHForms, Lemmas = somCAHLemmas, WordsLemmas = somCAHWordsLemmas, UnnormFW = somCAHFW)
benchmark = benchmarkHC(CAHRaw3gr, cahSOMList, k = 9)
round(benchmark, digits = 2)
## N CPWauchier AC CPREF
## raw3grams 100 0.98 0.76 0.92
## Affs 100 0.98 0.77 0.90
## FunctLemm 100 0.98 0.84 0.66
## POS3gr 100 0.98 0.80 0.64
## FLPOSandAffs 100 0.98 0.78 0.92
## Forms 100 0.98 0.79 0.83
## Lemmas 100 0.97 0.76 0.80
## WordsLemmas 100 0.98 0.76 0.80
## UnnormFW 100 0.97 0.84 0.80
Volatility index
#TODO: add SOMS ?
vol = volatility(cahList, k = 9)
out = merge(round(vol, digits = 2), nwords, by="row.names", all.x=TRUE, all.y=FALSE)
out[order(out[, "V_i"]), ]
## Row.names V_i y
## 44 45_Ano_Leg-C_Ap_NA_Pas_Andre2 -0.49 13315
## 28 28_Ano_Leg-B_Ma_Ho_Vie_Clement -0.48 2544
## 5 05_Ano_Leg-A_Ap_NA_Vie_Jacques -0.46 17920
## 11 11_Ano_Leg-A_Ap_NA_Vie_Marc -0.43 1820
## 40 41_Ano_Leg-C_Vi_NA_Vie_Irene -0.42 3145
## 59 60_Ano_Leg-B_NA_NA_NA_Antechriste -0.42 1485
## 42 43_Ano_Leg-C_Vi_NA_Vie_Catherine -0.29 8877
## 43 44_Ano_Leg-C_Ap_NA_Vie_Andre -0.29 3118
## 8 08_Ano_Leg-A_Ap_NA_Vie_Philippe -0.28 1014
## 9 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur -0.28 1356
## 26 26_Ano_Leg-B_Ma_Ev_Vie_Lambert -0.27 5247
## 45 46_Ano_Leg-B_Co_NA_Pur_Patrice -0.26 7872
## 46 47_Ano_Leg-C_Co_er_Vie_PaulErmite -0.26 3753
## 41 42_Ano_Leg-B_Vi_NA_Ass_NotreDame -0.25 3119
## 27 27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon -0.23 6565
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne -0.23 5529
## 13 13_Ano_Leg-B_Ma_Ho_Vie_Sebastien -0.22 3539
## 24 24_Ano_Leg-B_Ma_Ho_Vie_Laurent -0.21 3243
## 25 25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte -0.21 2513
## 23 23_Ano_Leg-B_Ma_Ho_Vie_Sixte -0.20 1894
## 14 14_Ano_Leg-B_Ma_Ho_Vie_Vincent -0.16 4838
## 19 19_Ano_Leg-B_Ma_Fe_Vie_Agnes -0.16 4177
## 50 51_Ano_Leg-C_Ma_ho_Vie_Eustache -0.14 3099
## 52 53_Ano_Leg-C_Vi_NA_Vie_Marguerite -0.11 1935
## 53 54_Ano_Leg-C_Vi_NA_Vie_Pelagie -0.11 1506
## 56 57_Ano_Leg-C_Vi_NA_Vie_Julien -0.06 2766
## 58 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie -0.06 1293
## 55 56_Ano_Leg-C_Co_NA_Vie_Mamertin -0.05 2202
## 49 50_Ano_Leg-C_NA_NA_Vie_Placide -0.02 2783
## 31 32_Wau_Leg-C_Co_Ev_Vie_Brice 0.00 1385
## 54 55_Ano_Leg-C_Co_NA_Vie_Simeon 0.00 2894
## 12 12_Ano_Leg-A_Ma_Ho_Vie_Longin 0.06 2244
## 22 22_Ano_Leg-B_Ma_Fe_Vie_Cecile 0.07 6782
## 47 48_Ano_Leg-C_Co_ev_Tra_Benoit2 0.07 3234
## 48 49_Ano_Leg-C_NA_NA_Vie_Maur 0.07 6310
## 51 52_Ano_Leg-C_Co_NA_Vie_Fursi 0.07 2492
## 20 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 0.08 1676
## 6 06_Ano_Leg-A_Ap_NA_Vie_Matthieu 0.13 6447
## 7 07_Ano_Leg-A_Ap_NA_Vie_SimonJude 0.13 6784
## 10 10_Ano_Leg-A_Ap_NA_Vie_Barthelemy 0.13 4360
## 37 38_Wau_Leg-C_Co_Ev_Vie_Jerome 0.14 2425
## 16 16_Ano_Leg-B_Ma_Ho_Vie_Christophe 0.18 9122
## 17 17_Ano_Leg-B_Ma_Fe_Vie_Agathe 0.18 3109
## 18 18_Ano_Leg-B_Ma_Fe_Vie_Luce 0.18 2366
## 21 21_Ano_Leg-B_Ma_Fe_Vie_Christine 0.18 7481
## 1 00_Ano_Leg-A_Ap_Ev_Dis_Pierre1 0.19 6774
## 2 01_Ano_Leg-A_Ap_NA_Vie_Pierre2 0.19 5527
## 3 02_Ano_Leg-A_Ap_NA_Pas_Paul 0.19 4798
## 4 04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev 0.19 4955
## 15 15_Ano_Leg-B_Ma_Ho_Vie_Georges 0.20 4548
## 33 34_Wau_Leg-C_Co_Ev_Vie_Martial 0.44 15255
## 35 36_Wau_Leg-C_Co_Ev_Mir_Nicolas2 0.44 10473
## 36 37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 0.44 8379
## 29 29_Wau_Leg-C_Co_Ev_Vie_Martin 0.59 14432
## 30 31_Wau_Leg-C_Co_Ev_Dia_Martin3 0.59 18971
## 34 35_Wau_Leg-C_Co_Ev_Vie_Nicolas 0.67 1960
## 39 40_Wau_Leg-C_Co_Er_Vie_Alexis 0.67 4103
## 32 33_Wau_Leg-C_Co_Er_Vie_Gilles 0.74 4415
## 38 39_Wau_Leg-C_Co_Ev_Vie_Benoit 0.74 12792
# see if there is a correlation
reg = lm(out[, 3] ~ out[, 2])
summary(reg)
##
## Call:
## lm(formula = out[, 3] ~ out[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5974 -2715 -1025 1244 14641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5173.8 520.6 9.939 4.71e-14 ***
## out[, 2] 4119.6 1648.0 2.500 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3994 on 57 degrees of freedom
## Multiple R-squared: 0.09879, Adjusted R-squared: 0.08298
## F-statistic: 6.248 on 1 and 57 DF, p-value: 0.01533
plot(out[, 2], out[, 3])
abline(reg)
